{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "d0a4e05f",
   "metadata": {},
   "source": [
    "## Data sourcing\n",
    "\n",
    "Source data from various source systems and ingest them using python code.\n",
    "\n",
    "1. Parquet files\n",
    "2. CSV files\n",
    "3. APIs\n",
    "4. RDBMS databases\n",
    "5. HTML"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0c7de7f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# import modules\n",
    "import certifi\n",
    "import json\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import sqlite3\n",
    "import urllib3\n",
    "from urllib3 import request\n",
    "from unicodedata import normalize"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "746175dc",
   "metadata": {},
   "source": [
    "### Sourcing Parquet data"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f8c37682",
   "metadata": {},
   "source": [
    "Please visit the url https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "132b5dc6",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read data from the Parquet file. We use pandas read_parquet method for ease and speed.\n",
    "df_parquet = pd.read_parquet(\"yellow_tripdata_2022-01.parquet\")\n",
    "df_parquet.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e8b2f93c",
   "metadata": {},
   "source": [
    "### Sourcing CSV data "
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c9a306a2",
   "metadata": {},
   "source": [
    "Please visit the url https://data.cityofnewyork.us/resource/h9gi-nx95.csv?$limit=500\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "56007303",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read data from the CSV file. We use pandas read_csv method for ease and speed.\n",
    "df_csv = pd.read_csv(\"h9gi-nx95.csv\")\n",
    "df_csv.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e2da41e6",
   "metadata": {},
   "source": [
    "### Sourcing data from APIs"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a6fc77b5",
   "metadata": {},
   "source": [
    "Please make sure to install the certifi library using - pipenv install certifi"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9a061b34",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get api data from url\n",
    "url = 'https://data.cityofnewyork.us/resource/h9gi-nx95.json?$limit=500'\n",
    "\n",
    "# Check if API is available to retrive the data\n",
    "apt_status = http.request('GET', url).status\n",
    "print(apt_status)\n",
    "if apt_status == 200:\n",
    "    # Sometimes we get certificate error . We shoul never silence this error as this may cause a securirty threat.\n",
    "    # Create a Pool manager that can be used to read the API response \n",
    "    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',ca_certs=certifi.where())\n",
    "    data = json.loads(http.request('GET', url).data.decode('utf-8'))\n",
    "    df_api = pd.json_normalize(data)\n",
    "else:\n",
    "    df_api = pd.Dataframe()\n",
    "df_api.head(10)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "282fc78f",
   "metadata": {},
   "source": [
    "### Sourcing Data from RDBMS tables"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0a384b2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read sqlite query results into a pandas DataFrame\n",
    "with sqlite3.connect(\"movies.sqlite\") as conn:\n",
    "    df = pd.read_sql(\"SELECT * from movies\", conn)\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab324ee5",
   "metadata": {},
   "source": [
    "# Sourcing data from Webpages"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6ff92439",
   "metadata": {},
   "source": [
    "Please visit the url https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "97c5fffd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# get data from url\n",
    "df_html = pd.read_html('https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)',match = 'by country')\n",
    "# Let's see how many tables are there with tage ' by county'\n",
    "print(len(df_html)) # There are 4 tables\n",
    "# Let's see the first table\n",
    "df_html[0]"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
